library(knitr)
library(ggplot2)
library(tools)
source("util_align.R")
## Warning: package 'polyester' was built under R version 3.4.3
## Warning: package 'seqinr' was built under R version 3.4.4
## Warning: package 'Biostrings' was built under R version 3.4.1
## Warning: package 'BiocGenerics' was built under R version 3.4.1
source("util_de.R")
##
## Attaching package: 'biomaRt'
## The following object is masked from 'package:seqinr':
##
## getSequence
## Warning: package 'limma' was built under R version 3.4.2
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## The following object is masked from 'package:seqinr':
##
## zscore
## Warning: package 'VennDiagram' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: futile.logger
To run all the simulations in this notebook, set runSimulation=TRUE To run the entire pipeline, set runPipeline=TRUE
runSimulation=FALSE
runPipeline=FALSE
This section automatically generates lists of differentially expressed genes after specifying user inputs. For the interpretation and validation of DE, see this section Before running the code, check the installation of required packages and add the path of each package to environment. If using Windows, Cygwin or Linux Subsystem for Windows 10 is required for the shell scripts (called by system function). Salmon is not supported on Windows, as of version 1.2.0
The versions used while constructing this pipeline is listed in parentheses
FastQC(v0.11.8): http://www.bioinformatics.babraham.ac.uk/projects/fastqc/ Salmon(0.8.2):https://salmon.readthedocs.io/en/latest/index.html (Patro et al. 2017) R packages:
Specify the following parameters before running the pipeline
outDir="Path to output directory"
## fastq files ##
dataDir_fq=c("vector of paths to directories containing all fastq files") #if there are multiple directories, each will be considered as a batch; the directory names will be used as batch name
fastq_extension=c('fastq','fq','fastq.gz','fq.gz') #add fastq file extension if not already listed
## pseudo-alignment with salmon ##
salmon_index=NA #if using pre-computed index, replace NA with path
reference_fa=NA #if not using pre-computed index, replace NA with path to the fasta of reference transcriptome
reftxpt='ensembl' #name of reference
paired_end=TRUE #if library is paired-end
if(paired_end){
#option 1: provide the min number of starting characters to identify a library pair, e.g. 9 for library01_left.fq, library01_right.fq, library02_left.fq, library01_right.fq
libID_len=NA #int or vector of length len(dataDir_fq)
#option 2: enter full path to all library pairs as shown
libPairs=list(c('path to library 1 left reads','path to library 1 right reads'),c('library 2 left reads','library 2 right reads'))
}
## DE ##
condition<-list(covariate1=c("condition1_1","condition1_2"),covariate2=c("condition2_1","condition2_2")) #list the experimental conditions; e.g. list(sex=c('female',"male"),treatment=c("high","low"))
refcondition<-list(c("covariateName","conditionName"),c("covariateName","conditionName")) #condition with respect to which DE of other samples is calculated
### example of loading transcript to gene matrx from ensembl using biomaRt; alternatively you can provide a matrix with the first column (target_id) containing transcript id and second column (ens_gene) containing gene id; this step aggregates transcription-level alignment to gene-level counts ###
ensembl <- useMart("ENSEMBL_MART_ENSEMBL",dataset="mmusculus_gene_ensembl")
ttg<-biomaRt::getBM(attributes = c("ensembl_transcript_id","ensembl_gene_id"),mart = ensembl)
ttg <- dplyr::rename(ttg, target_id = ensembl_transcript_id,ens_gene = ensembl_gene_id)
Optional adjustments to default parameters
alignmethod<-"salmon" #name of the aligner
## salmon ##
kmer_length=31
libType='A'
## DE ##
lowcountfilter=TRUE #if filter out low-count genes by CPM threshold
threshCPM_byAvgColSum=1.5/10e6 #genes passing filter have averages of replicates > avgColsum*threshCPM_byAvgColSum; avgColSum = average of sums of counts of all samples; genes passing filter for each batch are calculated separately
maxNDEG<-'all' #maximum number of genes in the output matrix
FDR<-1 #maximum false discovery rate above which genes will not be included in output
multipletesting_method<-'BH' #method to adjust for multiple testing; choose from "none", "BH", "BY" and "holm"
Initializing analysis…
## checking output directory ##
if(!dir.exists(outDir)){dir.create(outDir)}
batches<-c()
for(fqi in dataDir_fq){
fqi_name=basename(fqi)
batches<-c(batches,fqi_name)
}
contrasts<-condition[-which(condition %in% c(refcondition))] #TODO: modify this to generate a list with length=length(refcondition)
for(fqi in dataDir_fq){
fqi_name=basename(fqi)
fastqc_out=file.path(outDir,"fastQC_results",fqi_name)
dir.create(fastqc_out)
for(fq in list_files_with_exts(fqi,fastq_extension,recursive=TRUE,full.names = TRUE)){
system(paste0("fastqc -f fastq ",fq," --extract ","--outdir=",fastqc_out))
}
}
setwd(outDir)
if(is.na(salmon_index)){
system(paste0("salmon index -t ",reference_fa," -i transcripts_index_salmon --type quasi -k 31"))
salmon_index="transcripts_index_salmon"
}
if(!paired_end){
for(fqi in dataDir_fq){
fqi_name=basename(fqi)
salmon_out=file.path(outDir,paste0("salmon",reftxpt,fqi_name))
dir.create(salmon_out)
for(fq in list_files_with_exts(fqi,fastq_extension,recursive=TRUE,full.names = TRUE)){
salmon_out_i=file.path(salmon_out,file_path_sans_ext(basename(fq)))
dir.create(salmon_out_i)
system(paste0("salmon quant -i ",salmon_index," -l ",libType," -1 ",fq," -o ",salmon_out_i))
}
}
}else{
if(!is.na(libID_len)){
getLibPairs(libID_len,dataDir_fq) #TODO: implement getLibPairs in util_align
}
for(fqpair in libPairs){
fqpair_name=basename(dirname(fqpair[[1]]))
salmon_out=file.path(outDir,paste0("salmon",reftxpt,fqpair_name))
if(!dir.exists(salmon_out)){dir.create(salmon_out)}
salmon_out_i=file.path(salmon_out,file_path_sans_ext(basename(fqpair[[1]])))
dir.create(salmon_out_i)
system(paste0("salmon quant -i ",salmon_index," -l ",libType," -1 ",fqpair[[1]]," -2 ",fqpair[[2]]," -o ",salmon_out_i))
}
}
#TODO: modify util_de.R, include multiple covariates & refcondition
sourcedir<-outDir
setwd(sourcedir)
##get counts
countsdf<-countdf(alignmethod,reftxpt,batches,condition)
##low count filtering##
if(lowcountfilter==TRUE){
countsdfCPM<-filterCPM(countsdf,threshCPM_byAvgColSum)
}
savepath<-file.path(outDir,'de')
if(!dir.exists(savepath)){dir.create(savepath)}
for(c in contrasts){
deresults<-de(countsdfCPM_photo2_no20mW,c,multipletesting_method,nDEG = maxNDEG,FDR = FDR)
# drawvenn(deresults,"/Users/Xinyi/rna-seq/data/n2a_prelim/de/venn_de",refcondition,c)
genelist(deresults,savepath,paste0(c))
}
The first step in the analysis is usually to transform the fastq files into gene expression matrices. This involves aligning reads to reference genome or reference transcriptome and quantification of aligned reads. Here’s a list of choices to consider:
We used Polyester (Frazee et al. 2015) to simulate RNAseq libraries where a user generated fold-change matrix is used as the ground truth. This simulation is based on mouse mRNA reference transcriptome from UCSC (simAlign/reftxmrna_ucsc_refgene.fasta.gz, assembly GRCm38/mm10, UCSC RefSeq (refGene) table (Karolchik et al. 2004)).
To use our simulated data, please refer to simAlign/simfasta1, simAlign/simfasta2, simAlign/simfasta3
To use our framework directly, please refer to the code below and replace parameter values as needed (set eval or runSimulation to TRUE before running).
Required packages: - Polyester (Frazee et al. 2015) - seqinr - Biostrings
refPath<-"reftx/reftxmrna_ucsc_refgene.fasta.gz" #path to reference transcriptome
ngroup<-2 #number of conditions in the library
ndata<-3 #number of replicates in each condition
deprob<-0.15 #probability of differential expression
demean<-2 #mean of fold change for DE genes
savepath_fcmtx='simfcmtx' #directory to save fold change matrix
savepath_lib="simfasta" #directory to save simulated libraries
simulateLib_polyester(refPath,ngroup,ndata,deprob,savepath_fcmtx,savepath_lib)
Mapping rates between Salmon and kallisto are comparible when using ensemble and UCSC as the references.Mapping rates can be found as outputs of Salmon and Kallisto
percent error of transcript X = abs(estimated transcript count of X - actual count of X)/actual count of X * 100
We compared the impacts on two DE algorithms: DEseq2 and limma-voom. To reproduce our results, run deAgreement.R (results will be stored in simAlign/results/deagreement). This detects DE between two time points in our alpha-syn experiment (T2 and T7).
Two examples using DEseq2 and voom are shown below.
Statistical inferences of differential gene expression (DE), including the normalization procedures, usually make assumptions about typical datasets’ characteristics, such as gene-count distribution and degree of DE (Seyednasrollah, Laiho, and Elo 2015, Dillies et al. (2013), Dillies et al. (2013)). It is important to understand how robust each normalization or DGE analysis method is when its assumptions are violated.
To test the performance given datasets with distinct features, we simulated
All simulations used Splatter (Zappia, Belinda, and Alicia 2017) with dropout rate equals to zero. Fixed parameters of the empirical simulations were derived from a dataset by Bottomly et al (Bottomly et al. 2011).
We tested the following combinations of normalization and DE analysis methods:
The benchmark results can be found in testdata/normtestresults and testdata/DEtestresults
The simulated datasets can be found in testdata/simulatedData
To reproduce our results, see the next section
Choice of normalization methods had very small impact on the benchmarking results. The ROC, accuracy vs FDR, and FDR control curves are comparable for all normalizations. An example is shown for ROC
Choice of DE analysis methods has trade-off between true positive rates and false discovery rates
Requires: - splatter 1.0.3 (Zappia, Belinda, and Alicia 2017)
## enter the parameter values you want to test ##
groupCells<-c(3,9,6)
mean.shape<-c(0.4,0.8,0.6)
mean.rate<-c(0.1,0.5,0.3)
lib.loc<- c(6,16,11)
lib.scale<-c(0.15,0.5,0.35)
out.prob<-c(0.01,0.1,0.05)
de.prob<-c(0.05,0.3,0.1)
de.downProb<-c(0.25,0.75,0.5)
de.facLoc<-c(0.05,0.3,0.1)
de.facScale<-c(0.2,0.6,0.4)
bcv.common<-c(0.05,0.2,0.1)
generateSimulatedCounts(normalizationMethod,deMethod,groupCells,mean.shape,mean.rate,lib.loc,lib.scale,out.prob,de.prob,de.downProb,de.facLoc,bcv.common)
We performed RNA-seq twice on a total of 6 samples of mouse Neuro2A cell cultures (3 samples for each RNA-seq). The same protocols were used and all libraries have comparable quality. We found 2665 differentially expressed genes with FDR < 0.05 using the default settings of this pipeline. With the limited number of replicates available in a typical RNA-seq experiment, it is important to consider confounding differences caused by experimental design and handling. In the following sections, we provide some methods for validation and interpretation of DE genes.
This tests for any over or under represented annotation terms in the query list with respect to the background. Several options are listed below:
Some common choices of background include genes known to be expressed in a particular cell type of interest. A proxy for cell-type specific background can be found by using all non-zero genes:
countsmtx<-as.matrix(as.data.frame(countsdf))
background<-rownames(countsmtx)[which(rowSums(countsmtx) > 0)]
However, it is possible that this list does not cover all genes that can be expressed due to the limited experimental conditions tested.
Bottomly, Daniel, Nicole A. R. Walter, Jessica Ezzell Hunter, Priscila Darakjian, Sunita Kawane, Kari J. Buck, Robert P. Searles, Michael Mooney, Shannon K. McWeeney, and Robert Hitzemann. 2011. “Evaluating Gene Expression in C57BL/6J and DBA/2J Mouse Striatum Using RNA-Seq and Microarrays.” PloS One 6 (3): e17820. doi:10.1371/journal.pone.0017820.
Dillies, Marie-Agnès, Andrea Rau, Julie Aubert, Christelle Hennequet-Antier, Marine Jeanmougin, Nicolas Servant, Céline Keime, et al. 2013. “A Comprehensive Evaluation of Normalization Methods for Illumina High-Throughput RNA Sequencing Data Analysis.” Briefings in Bioinformatics 14 (6): 671–83. doi:10.1093/bib/bbs046.
Frazee, Alyssa C., Andrew E. Jaffe, Ben Langmead, and Jeffrey T. Leek. 2015. “Polyester: Simulating RNA-Seq Datasets with Differential Transcript Expression.” Bioinformatics 31 (17): 2778–84. doi:10.1093/bioinformatics/btv272.
Huang, Da Wei, Brad T. Sherman, and Richard A. Lempicki. 2009. “Systematic and Integrative Analysis of Large Gene Lists Using DAVID Bioinformatics Resources.” Nature Protocols 4 (1): 44–57. doi:10.1038/nprot.2008.211.
Karolchik, Donna, Angela S. Hinrichs, Terrence S. Furey, Krishna M. Roskin, Charles W. Sugnet, David Haussler, and W. James Kent. 2004. “The UCSC Table Browser Data Retrieval Tool.” Nucleic Acids Research 32 (Database issue): D493–496. doi:10.1093/nar/gkh103.
Mi, Huaiyu, Anushya Muruganujan, Xiaosong Huang, Dustin Ebert, Caitlin Mills, Xinyu Guo, and Paul D. Thomas. 2019. “Protocol Update for Large-Scale Genome and Gene Function Analysis with the PANTHER Classification System (V.14.0).” Nature Protocols 14 (3): 703–21. doi:10.1038/s41596-019-0128-8.
Patro, Rob, Geet Duggal, Michael I. Love, Rafael A. Irizarry, and Carl Kingsford. 2017. “Salmon Provides Fast and Bias-Aware Quantification of Transcript Expression.” Nature Methods 14 (4): 417. doi:10.1038/nmeth.4197.
Ritchie, Matthew E., Belinda Phipson, Di Wu, Yifang Hu, Charity W. Law, Wei Shi, and Gordon K. Smyth. 2015. “Limma Powers Differential Expression Analyses for RNA-Sequencing and Microarray Studies.” Nucleic Acids Research 43 (7): e47–e47. doi:10.1093/nar/gkv007.
Seyednasrollah, Fatemeh, Asta Laiho, and Laura L. Elo. 2015. “Comparison of Software Packages for Detecting Differential Expression in RNA-Seq Studies.” Briefings in Bioinformatics 16 (1): 59–70. doi:10.1093/bib/bbt086.
Zappia, Luke, Phipson Belinda, and Oshlack Alicia. 2017. “Splatter: Simulation of Single-Cell RNA Sequencing Data Genome Biology Full Text.” Genome Biology 18 (174). https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1305-0.
Zhao, Shanrong, and Baohong Zhang. 2017. “A Comprehensive Evaluation of Ensembl, RefSeq, and UCSC Annotations in the Context of RNA-Seq Read Mapping and Gene Quantification BMC Genomics Full Text.” Accessed December 18. https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-015-1308-8.